#### Replicate LDA Analysis ####

# NOTE: This script is incredibly memory and computationally intensive. It will
# likely take weeks to run on a standard laptop, if it runs at all. We suggest
# only attempting to run this script on a high memory workstation (128GB RAM+,
# 20+ cores) or similar. The long runtime is due to the size of the corpus, and
# the intensive cross validation.

# Set your working directory to Data subdirectory of the replication repo. For
# us, this looks like:
setwd("~/Desktop/Replication_Materials/Data")

# Install preText:
install.packages("preText")

# Load the packages we will need:
library(preText)
library(quanteda)
library(ggplot2)
library(gridExtra)
library(topicmodels)

# Load the data:
load("Press_Releases.RData")

# load in labels for plotting:
load("128_Combination_Preprocessing_Labels.RData")

# Preprocess the data (this may take days):
press_releases_fact_prep <- factorial_preprocessing(documents,
                                                    parallel = TRUE,
                                                    cores = 20)

# Set seed for replicability
set.seed(12345)
cross_validation_splits <- 10
press_release_indices <- 1:1000

# Now we can create 10 test/train splits from this data:
press_release_train_inds <- vector(mode = "list",
                                   length = cross_validation_splits)
press_release_test_inds <- vector(mode = "list",
                                  length = cross_validation_splits)

# Loop over splits:
for (i in 1:cross_validation_splits) {
    # get the indices for the 20% test split:
    test <- sample(press_release_indices,
                   size = round(length(press_release_indices)/5),
                   replace = FALSE)
    # Go through and yank each of these out of the press release to be the
    # train indices
    train <- press_release_indices
    for (j in 1:length(test)) {
        train <- train[-which(train == test[j])]
    }

    press_release_train_inds[[i]] <- train
    press_release_test_inds[[i]] <- test
}

# This will also take days and use a large amount of memory:
press_release_cross_validation_results <- optimal_k_comparison(
    cross_validation_train_document_indicies = press_release_train_inds,
    cross_validation_test_document_indicies = press_release_test_inds,
    dfm_object_list = press_releases_fact_prep[[2]],
    topics = c(25,50,75,100,125,150,175,200),
    parallel = TRUE,
    cores = 20)

# vector to store optime number of topics in:
optimal_k <- rep(0, 64)

# extract optimal number of topics
for (i in 1:64) {
    optimal_k[i] <- press_release_cross_validation_results[[i]]$optimal_k
}

# Since we are not considering n-grams, we only need the last 64 entries in
# these vectors:
labs <- labels[65:128]
steps <- num_steps[65:128]

# create a data.frame
topics_data <- data.frame(steps = steps,
                          optimal_num_topics = optimal_k,
                          labels = labs,
                          stringsAsFactors = FALSE)
# Color for dots:
UMASS_BLUE <- rgb(51,51,153,195,maxColorValue = 255)

# Make the plot of number of steps against optimal number of topics:
pdf(file = "Press_Releases_Optimal_Num_Topics.pdf",
    width = 9,
    height = 5)
ggplot(topics_data, aes(steps, optimal_num_topics)) +
    geom_point(colour = UMASS_BLUE, size = 3) +
    geom_text(aes(label=labels),
              size=4,
              hjust=1,
              vjust=0,
              position = position_jitter(width=0, height=20)) +
    xlab("Number of Preprocessing Steps") +
    ylab("Optimal Number of Topics")
dev.off()


topic_model_fits <- vector(mode = "list", length = 64)
# now run each topic model for the optimal number of (discovered) topics:
for (i in 1:64) {
    cat("currently working on dfm:",i,"\n")
    current_dfm <- press_releases_fact_prep[[2]][[i]]
    print(Sys.time())
    fit <- topicmodels::LDA(quanteda::convert(current_dfm,
                                              to = "topicmodels"),
                            k = optimal_k[i])
    topic_model_fits[[i]] <- fit
}

# now get the top 20 terms out of each topic model fit:
top_terms_list <- vector(mode = "list", length = 64)
for (i in 1:64) {
    top_terms <- terms(topic_model_fits[[i]],20)
    top_terms_list[[i]] <- top_terms
}

# function to locate a term in the topic model top terms:
find_term <- function(vec, term) {
    tc <- 0
    for(i in 1:length(term)) {
        tc <- tc + sum(grepl(term[i],vec, ignore.case = T))
    }
    if (tc > 0) {
        return(TRUE)
    } else {
        return(FALSE)
    }
}

# look for topics containing the terms below:
# terms to search for:
search_list <- list(iraq = c("iraq"),
                    terror = c("terror"),
                    al_qaeda = c("qaeda"),
                    insurance = c("insur"),
                    stem_cell = c("stem"))

# data.frame to store results in:
num_topics <- rep(0, length = 64)
topics_in_results <- data.frame(
    preprocessing_steps = labels[65:128],
    iraq = num_topics,
    terror = num_topics,
    al_qaeda = num_topics,
    insurance = num_topics,
    stem_cell = num_topics,
    optimal_number_of_topics = optimal_k,
    stringsAsFactors = FALSE)

# Find the number of topics in which each key term appears:
for (i in 1:64) {
    print(paste("Curent DFM:",i))
    top_terms <- top_terms_list[[i]]
    for (j in 1:length(search_list)) {
        in_topic <- apply(top_terms,
                          2,
                          find_term,
                          term = search_list[[j]])
        which_topics <- which(in_topic)
        topics_in_results[i,(j+1)] <- length(which_topics)
    }
}


# make a nice looking plot of the proportion of topics in which each term occurs
# for each preprocessing specification:
pdf(file = "Topic_Model_Term_Occurence.pdf", width = 14.2,height = 6)
rescaled_results <- topic_key_term_plot(
    topic_key_term_results = topics_in_results,
    key_term_columns  = 2:6,
    custom_col_names = c("Iraq", "Terrorism", "Al Qaeda", "Insurance", "Stem Cell"),
    custom_labels = c("0%","<1%","1-2%","2-3%","3-4%","4-5%","5-6%","6-7%","7-8%",
                      "8-9%","9-10%","10%+"),
    one_matrix = FALSE,
    thresholds = c(-0.0001,0,0.0099,0.0199,0.0299,0.0399,0.0499,0.0599,0.0699,
                   0.0799,0.0899,0.0999),
    labs = labels[65:128],
    return_data = TRUE)
dev.off()

